import pandas as pd
import numpy as np
import json
import plotly.express as px
The NYC neighborhood map as a geojson file is taken from:
https://data.cityofnewyork.us/City-Government/Neighborhood-Tabulation-Areas-NTA-/cpf4-rkhq
# Load the NYC neighborhood polygons (GeoJSON).  A context manager closes the
# file handle promptly instead of leaking it until garbage collection.
with open("nyc_neighborhoods.geojson") as f:
    nycmap = json.load(f)
Property sales data (Nov 2019 - Oct 2020) is taken from the NYC Department of Finance:
https://www1.nyc.gov/site/finance/taxes/property-rolling-sales-data.page
# Read the five borough sales workbooks and stack them into one DataFrame.
boroughs = ['manhattan', 'bronx', 'brooklyn', 'queens', 'statenisland']
# Collect the frames first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated data on every iteration (quadratic cost).
frames = [pd.read_excel('rollingsales_' + borough + '.xls',
                        header=4,
                        usecols=['BOROUGH', 'BLOCK', 'LOT', 'NEIGHBORHOOD',
                                 'TAX CLASS AT PRESENT', 'SALE PRICE', 'SALE DATE'])
          for borough in boroughs]
df = pd.concat(frames)
# Rename columns.  NOTE(review): usecols does not reorder columns — pandas keeps
# the sheet's own column order — so this rename assumes the sheets list the
# columns as Borough, Neighborhood, Tax Class, Block, Lot, Sale Price, Date
# (consistent with the preview below); confirm if the source files change.
df.columns = ['Borough', 'Neighborhood', 'Tax Class', 'Block', 'Lot', 'Sale Price', 'Date']
df.head()
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 1 | 377 | 72 | 0 | 2020-10-02 |
| 1 | 1 | ALPHABET CITY | 1 | 377 | 66 | 4350000 | 2020-07-16 |
| 2 | 1 | ALPHABET CITY | 2A | 376 | 23 | 600000 | 2019-12-12 |
| 3 | 1 | ALPHABET CITY | 2 | 376 | 28 | 1 | 2020-06-26 |
| 4 | 1 | ALPHABET CITY | 2 | 376 | 31 | 290000 | 2020-02-07 |
The correlation between the borough-block-lot (bbl) number and the 2010 census tract is taken from the NYC Pluto data downloader:
https://chriswhong.github.io/plutoplus
# Borough-block-lot -> 2010 census tract lookup table.  Everything is read as
# strings to preserve leading zeros; rows with missing values are discarded.
pluto = pd.read_csv('bbl_tract.csv', dtype=str).dropna()
pluto.head()
| bbl | borocode | ct2010 | |
|---|---|---|---|
| 0 | 3049590067 | 3 | 838 |
| 1 | 3018910060 | 3 | 191 |
| 2 | 3018910061 | 3 | 191 |
| 3 | 2044370024 | 2 | 328 |
| 4 | 3018910063 | 3 | 191 |
Finally, the mapping from the 2010 census tract to the 2010 Neighborhood Tabulation Area (NTA) is taken from the NYC Department of City Planning:
https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-nynta.page
# Census-tract -> NTA lookup from the Dept. of City Planning workbook.
# header=3 skips the title rows; row 0 is then dropped as well — presumably a
# stray label/subheader row, TODO confirm against the workbook.
census = pd.read_excel(
    'tract_nta.xlsx',
    header=3,
    usecols=['2010 NYC Borough Code', '2010 Census Tract',
             'Neighborhood Tabulation Area (NTA)', 'Unnamed: 6'],
    dtype=str,
).drop(0)
# Shorter working names for the retained columns.
census.columns = ['Borough Code', '2010 Tract', 'NTA Code', 'NTA Name']
census.head()
| Borough Code | 2010 Tract | NTA Code | NTA Name | |
|---|---|---|---|---|
| 1 | 2 | 031000 | BX31 | Allerton-Pelham Gardens |
| 2 | 2 | 031200 | BX31 | Allerton-Pelham Gardens |
| 3 | 2 | 031400 | BX31 | Allerton-Pelham Gardens |
| 4 | 2 | 031600 | BX31 | Allerton-Pelham Gardens |
| 5 | 2 | 031800 | BX31 | Allerton-Pelham Gardens |
# Preview the combined sales data before cleaning.
df.head()
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 1 | 377 | 72 | 0 | 2020-10-02 |
| 1 | 1 | ALPHABET CITY | 1 | 377 | 66 | 4350000 | 2020-07-16 |
| 2 | 1 | ALPHABET CITY | 2A | 376 | 23 | 600000 | 2019-12-12 |
| 3 | 1 | ALPHABET CITY | 2 | 376 | 28 | 1 | 2020-06-26 |
| 4 | 1 | ALPHABET CITY | 2 | 376 | 31 | 290000 | 2020-02-07 |
Let's first clean up df. We see that there are some extremely low sale prices, indicating transfers of ownerships with little cash considerations. These transfers should not be considered in this analysis.
Here, I consider only the rows with a sale price > 100,000.
# Keep only genuine sales (price above $100,000) and renumber the rows.
df = df.loc[df['Sale Price'] > 100000].reset_index(drop=True)
df.head()
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 1 | 377 | 66 | 4350000 | 2020-07-16 |
| 1 | 1 | ALPHABET CITY | 2A | 376 | 23 | 600000 | 2019-12-12 |
| 2 | 1 | ALPHABET CITY | 2 | 376 | 31 | 290000 | 2020-02-07 |
| 3 | 1 | ALPHABET CITY | 2 | 376 | 31 | 1269168 | 2019-12-19 |
| 4 | 1 | ALPHABET CITY | 2 | 376 | 31 | 125375 | 2020-03-13 |
Next, I noticed that there are duplicates of sale prices, typically on the same date, in the same neighborhood, and in the same block or nearby. It looks like multiple properties are traded at the same time, and the sale price listed after each property is likely the total sale price, rather than the individual sale price.
# Show sales that share borough, neighborhood, price, and date — these look
# like multi-property deals where every row carries the deal's total price.
dup_mask = df.duplicated(
    subset=['Borough', 'Neighborhood', 'Sale Price', 'Date'], keep=False)
df[dup_mask].head(15)
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 7 | 1 | ALPHABET CITY | 2B | 384 | 6 | 1250000 | 2020-06-16 |
| 8 | 1 | ALPHABET CITY | 2B | 384 | 7 | 1250000 | 2020-06-16 |
| 62 | 1 | ALPHABET CITY | 2 | 384 | 1408 | 1630000 | 2020-10-26 |
| 67 | 1 | ALPHABET CITY | 2 | 392 | 1121 | 1350000 | 2020-01-24 |
| 85 | 1 | ALPHABET CITY | 4 | 384 | 1404 | 1630000 | 2020-10-26 |
| 86 | 1 | ALPHABET CITY | 4 | 392 | 1101 | 1350000 | 2020-01-24 |
| 163 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 164 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 165 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 166 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 167 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 168 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 169 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 170 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
| 171 | 1 | CHELSEA | 2 | 766 | 9 | 4579002 | 2020-01-02 |
We see that varying numbers of properties are traded at the same time. Large numbers of simultaneous trades are likely made not by regular home-buyers but by investors. Here, to make this analysis more relevant to regular home-buyers, I decide to drop any group of rows that contains more than three duplicates.
# Keep only groups of at most three simultaneous trades; larger bundles are
# likely investor activity rather than regular home purchases.
group_sizes = df.groupby(['Borough', 'Neighborhood',
                          'Sale Price', 'Date'])['Block'].transform('count')
df = df[group_sizes <= 3].reset_index(drop=True)
# Collect the remaining duplicates.  .copy() makes this an independent frame,
# so the later column insertions do not raise SettingWithCopyWarning (and are
# guaranteed not to write through to df).
duplicates = df[df.duplicated(subset=['Borough', 'Neighborhood',
                                      'Sale Price', 'Date'], keep=False)].copy()
duplicates.head(15)
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 7 | 1 | ALPHABET CITY | 2B | 384 | 6 | 1250000 | 2020-06-16 |
| 8 | 1 | ALPHABET CITY | 2B | 384 | 7 | 1250000 | 2020-06-16 |
| 62 | 1 | ALPHABET CITY | 2 | 384 | 1408 | 1630000 | 2020-10-26 |
| 67 | 1 | ALPHABET CITY | 2 | 392 | 1121 | 1350000 | 2020-01-24 |
| 85 | 1 | ALPHABET CITY | 4 | 384 | 1404 | 1630000 | 2020-10-26 |
| 86 | 1 | ALPHABET CITY | 4 | 392 | 1101 | 1350000 | 2020-01-24 |
| 280 | 1 | CHELSEA | 2 | 723 | 1028 | 3230000 | 2020-03-10 |
| 281 | 1 | CHELSEA | 2 | 723 | 1048 | 4820000 | 2019-12-11 |
| 294 | 1 | CHELSEA | 2 | 766 | 1431 | 4400000 | 2019-12-10 |
| 302 | 1 | CHELSEA | 2 | 768 | 1002 | 950000 | 2019-11-08 |
| 303 | 1 | CHELSEA | 2 | 768 | 1015 | 950000 | 2019-11-08 |
| 327 | 1 | CHELSEA | 2 | 794 | 1114 | 2280000 | 2020-01-09 |
| 335 | 1 | CHELSEA | 2 | 796 | 1214 | 5450000 | 2019-11-14 |
| 336 | 1 | CHELSEA | 2 | 796 | 1215 | 5450000 | 2019-11-14 |
| 379 | 1 | CHELSEA | 2 | 718 | 1004 | 4400000 | 2019-12-10 |
Now there are at most three duplicates. However, the sale prices still reflect the total price, rather than the individual prices. Next, I divide the total price by the number of properties traded to calculate the individual price.
# Duplicate the Sale Price column: one copy stays a groupby key, the other is
# turned into the per-property average (total price / properties in the deal).
duplicates.insert(7, 'Avg Sale Price', duplicates['Sale Price'])
deal_keys = ['Borough', 'Neighborhood', 'Sale Price', 'Date']
avg_sale = (duplicates.groupby(deal_keys)['Avg Sale Price']
            .transform(lambda g: g / g.size))
avg_sale.head(10)
7 625000.0 8 625000.0 62 815000.0 67 675000.0 85 815000.0 86 675000.0 280 1615000.0 281 2410000.0 294 2200000.0 302 475000.0 Name: Avg Sale Price, dtype: float64
# Replace the total price with the per-property average computed above.
duplicates = duplicates.drop(columns=['Sale Price', 'Avg Sale Price'])
duplicates.insert(5, 'Sale Price', avg_sale)
duplicates.head(10)
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 7 | 1 | ALPHABET CITY | 2B | 384 | 6 | 625000.0 | 2020-06-16 |
| 8 | 1 | ALPHABET CITY | 2B | 384 | 7 | 625000.0 | 2020-06-16 |
| 62 | 1 | ALPHABET CITY | 2 | 384 | 1408 | 815000.0 | 2020-10-26 |
| 67 | 1 | ALPHABET CITY | 2 | 392 | 1121 | 675000.0 | 2020-01-24 |
| 85 | 1 | ALPHABET CITY | 4 | 384 | 1404 | 815000.0 | 2020-10-26 |
| 86 | 1 | ALPHABET CITY | 4 | 392 | 1101 | 675000.0 | 2020-01-24 |
| 280 | 1 | CHELSEA | 2 | 723 | 1028 | 1615000.0 | 2020-03-10 |
| 281 | 1 | CHELSEA | 2 | 723 | 1048 | 2410000.0 | 2019-12-11 |
| 294 | 1 | CHELSEA | 2 | 766 | 1431 | 2200000.0 | 2019-12-10 |
| 302 | 1 | CHELSEA | 2 | 768 | 1002 | 475000.0 | 2019-11-08 |
### Replace duplicate sale prices in df with average prices
# A single vectorized label-based assignment replaces the original row-by-row
# Python loop; duplicates' index is a subset of df's index, so .loc aligns
# the averaged prices to the right rows by label.
df.loc[duplicates.index, 'Sale Price'] = duplicates['Sale Price']
df.head(10)
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 1 | 377 | 66 | 4350000.0 | 2020-07-16 |
| 1 | 1 | ALPHABET CITY | 2A | 376 | 23 | 600000.0 | 2019-12-12 |
| 2 | 1 | ALPHABET CITY | 2 | 376 | 31 | 290000.0 | 2020-02-07 |
| 3 | 1 | ALPHABET CITY | 2 | 376 | 31 | 1269168.0 | 2019-12-19 |
| 4 | 1 | ALPHABET CITY | 2 | 376 | 31 | 125375.0 | 2020-03-13 |
| 5 | 1 | ALPHABET CITY | 2 | 376 | 41 | 585000.0 | 2020-02-06 |
| 6 | 1 | ALPHABET CITY | 2 | 376 | 54 | 249508.0 | 2020-06-03 |
| 7 | 1 | ALPHABET CITY | 2B | 384 | 6 | 625000.0 | 2020-06-16 |
| 8 | 1 | ALPHABET CITY | 2B | 384 | 7 | 625000.0 | 2020-06-16 |
| 9 | 1 | ALPHABET CITY | 2 | 390 | 50 | 220000.0 | 2019-11-05 |
Next, we are going to select only the tax classes 1 and 2, which correspond to residential housings.
# Reduce 'Tax Class' to its leading digit (drops subclass letters like '2A').
df['Tax Class'] = df['Tax Class'].astype(str).str[0]
# Residential properties fall under tax classes 1 and 2.
df = df[df['Tax Class'].isin(['1', '2'])]
df.head()
| Borough | Neighborhood | Tax Class | Block | Lot | Sale Price | Date | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 1 | 377 | 66 | 4350000.0 | 2020-07-16 |
| 1 | 1 | ALPHABET CITY | 2 | 376 | 23 | 600000.0 | 2019-12-12 |
| 2 | 1 | ALPHABET CITY | 2 | 376 | 31 | 290000.0 | 2020-02-07 |
| 3 | 1 | ALPHABET CITY | 2 | 376 | 31 | 1269168.0 | 2019-12-19 |
| 4 | 1 | ALPHABET CITY | 2 | 376 | 31 | 125375.0 | 2020-03-13 |
The next task is to map the properties to the geojson map. The key used in the geojson file here is the NYC NTA (Neighborhood Tabulation Area) code, or NTA name.
To find out which NTA each property corresponds to, I chose to use the bbl (borough-block-lot) number to find the corresponding 2010 census tract, and then find the corresponding NTA.
A NYC bbl number uniquely identifies the location of a property, so the mapping method outlined above should find the NTA of each property. However, I noticed that over the years some lot numbers have been restructured; two-digit lot numbers are split up into multiple four-digit lot numbers. The removal and addition of lot numbers makes it tricky to map using the whole bbl.
Therefore, instead of using the whole bbl, I use only the borough-block number, because we actually don't need the precision of the full bbl for this analysis.
# Build the borough-block identifier ('bbl' minus the lot).  Vectorized string
# operations replace the original row-wise .apply(axis=1), which runs a Python
# lambda per row and is far slower on large frames; zfill(5) left-pads the
# block number with zeros exactly like rjust(5, '0') for these positive ints.
df['bbl'] = df['Borough'].astype(str) + df['Block'].astype(str).str.zfill(5)
# Block, Lot, and Date are no longer needed for the neighborhood mapping.
df = df.drop(['Block', 'Lot', 'Date'], axis=1)
df.head()
| Borough | Neighborhood | Tax Class | Sale Price | bbl | |
|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 1 | 4350000.0 | 100377 |
| 1 | 1 | ALPHABET CITY | 2 | 600000.0 | 100376 |
| 2 | 1 | ALPHABET CITY | 2 | 290000.0 | 100376 |
| 3 | 1 | ALPHABET CITY | 2 | 1269168.0 | 100376 |
| 4 | 1 | ALPHABET CITY | 2 | 125375.0 | 100376 |
# Truncate each bbl to its first six characters (borough + block), then
# collapse the rows made identical by dropping the lot digits.
pluto['bbl'] = pluto['bbl'].str[:6]
pluto = pluto.drop_duplicates()
pluto.head()
| bbl | borocode | ct2010 | |
|---|---|---|---|
| 0 | 304959 | 3 | 838 |
| 1 | 301891 | 3 | 191 |
| 3 | 204437 | 2 | 328 |
| 6 | 306533 | 3 | 452 |
| 8 | 304960 | 3 | 838 |
The format of 2010 census tract is different in pluto and in census. In census, it is always 6 digits, whereas in pluto it is 2 to 4 digits, and sometimes with decimal points.
Here's the rule of conversion:
For example, 838 becomes 083800, 1502 becomes 150200, 798.02 becomes 079802, and so on.
def _tract_to_census_format(tract):
    """Normalize a pluto tract string to the 6-digit census format.

    '838' -> '083800', '1502' -> '150200', '798.02' -> '079802': the digits
    after the decimal point (if any) become the trailing characters, otherwise
    '00' is appended, and the result is left-padded with zeros to 6 chars.
    """
    whole, dot, frac = tract.partition('.')
    return (whole + (frac if dot else '00')).rjust(6, '0')

pluto['ct2010'] = pluto['ct2010'].map(_tract_to_census_format)
pluto.head()
| bbl | borocode | ct2010 | |
|---|---|---|---|
| 0 | 304959 | 3 | 083800 |
| 1 | 301891 | 3 | 019100 |
| 3 | 204437 | 2 | 032800 |
| 6 | 306533 | 3 | 045200 |
| 8 | 304960 | 3 | 083800 |
Although the census tract number is unique within the neighborhoods, it is not unique throughout NYC. I decided to create a column named 'Identifier' with a format of (borough code, census tract), which is unique in NYC.
# Tag each row with a city-unique (borough code, census tract) tuple.
# list(zip(...)) builds the tuples in one C-level pass — much faster than the
# original row-wise .apply(axis=1), with identical values.
pluto['Identifier'] = list(zip(pluto['borocode'], pluto['ct2010']))
pluto = pluto.drop(['borocode', 'ct2010'], axis=1)
pluto.head()
| bbl | Identifier | |
|---|---|---|
| 0 | 304959 | (3, 083800) |
| 1 | 301891 | (3, 019100) |
| 3 | 204437 | (2, 032800) |
| 6 | 306533 | (3, 045200) |
| 8 | 304960 | (3, 083800) |
census: 2010 census tract to NTA. The remaining task is simple: I just need to add a column named 'Identifier', similar to what I did above for pluto.
# Same (borough code, census tract) identifier as in pluto, so the two tables
# can be joined on it.  zip avoids the slow row-wise .apply(axis=1).
census['Identifier'] = list(zip(census['Borough Code'], census['2010 Tract']))
census = census.drop(['Borough Code', '2010 Tract'], axis=1)
census.head()
| NTA Code | NTA Name | Identifier | |
|---|---|---|---|
| 1 | BX31 | Allerton-Pelham Gardens | (2, 031000) |
| 2 | BX31 | Allerton-Pelham Gardens | (2, 031200) |
| 3 | BX31 | Allerton-Pelham Gardens | (2, 031400) |
| 4 | BX31 | Allerton-Pelham Gardens | (2, 031600) |
| 5 | BX31 | Allerton-Pelham Gardens | (2, 031800) |
### Merge dataframes
# Attach the tract identifier (via pluto), then the NTA code/name (via census).
df = df.merge(pluto, how='left', on='bbl')
df = df.merge(census, how='left', on='Identifier')
# Median sale price and sale count per NTA.  Borough is presumably constant
# within an NTA, so its median simply carries the borough code along.
sale_price = (df.groupby(['NTA Code', 'NTA Name'])
                .agg(**{'Sale Price': ('Sale Price', 'median'),
                        'Count': ('Sale Price', 'count'),
                        'Borough': ('Borough', 'median')})
                .reset_index())
# NTAs with too few sales give unreliable medians; require more than 10.
sale_price = sale_price[sale_price['Count'] > 10]
sale_price.head()
| NTA Code | NTA Name | Sale Price | Count | Borough | |
|---|---|---|---|---|---|
| 0 | BK09 | Brooklyn Heights-Cobble Hill | 1200000.0 | 236 | 3 |
| 1 | BK17 | Sheepshead Bay-Gerritsen Beach-Manhattan Beach | 465000.0 | 463 | 3 |
| 2 | BK19 | Brighton Beach | 425000.0 | 156 | 3 |
| 3 | BK21 | Seagate-Coney Island | 630000.0 | 53 | 3 |
| 4 | BK23 | West Brighton | 385000.0 | 87 | 3 |
# Choropleth of median sale price per NTA over the NYC neighborhood polygons.
map_options = dict(
    geojson=nycmap,
    locations="NTA Code",               # column matched against the geojson features
    featureidkey="properties.ntacode",  # geojson property holding the NTA code
    color="Sale Price",
    color_continuous_scale="matter",
    mapbox_style="carto-positron",
    zoom=9,
    center={"lat": 40.7, "lon": -73.9},
    opacity=0.7,
    hover_name="NTA Name",
)
fig = px.choropleth_mapbox(sale_price, **map_options)
fig.show()
# The 20 most expensive neighborhoods by median sale price.
sale_price.sort_values(by='Sale Price', ascending=False).iloc[:20]
| NTA Code | NTA Name | Sale Price | Count | Borough | |
|---|---|---|---|---|---|
| 104 | MN24 | SoHo-TriBeCa-Civic Center-Little Italy | 2703500.0 | 571 | 1 |
| 98 | MN17 | Midtown-Midtown South | 1917500.0 | 352 | 1 |
| 96 | MN14 | Lincoln Square | 1900000.0 | 1074 | 1 |
| 114 | MN40 | Upper East Side-Carnegie Hill | 1707500.0 | 682 | 1 |
| 13 | BK33 | Carroll Gardens-Columbia Street-Red Hook | 1525000.0 | 272 | 3 |
| 95 | MN13 | Hudson Yards-Chelsea-Flat Iron-Union Square | 1450000.0 | 529 | 1 |
| 30 | BK64 | Prospect Heights | 1225000.0 | 261 | 3 |
| 0 | BK09 | Brooklyn Heights-Cobble Hill | 1200000.0 | 236 | 3 |
| 36 | BK76 | Greenpoint | 1195000.0 | 153 | 3 |
| 102 | MN22 | East Village | 1181250.0 | 142 | 1 |
| 103 | MN23 | West Village | 1175000.0 | 711 | 1 |
| 16 | BK37 | Park Slope-Gowanus | 1165356.0 | 637 | 3 |
| 31 | BK68 | Fort Greene | 1152464.0 | 149 | 3 |
| 34 | BK73 | North Side-South Side | 1150000.0 | 294 | 3 |
| 94 | MN12 | Upper West Side | 1099000.0 | 786 | 1 |
| 45 | BK90 | East Williamsburg | 1024999.5 | 158 | 3 |
| 106 | MN27 | Chinatown | 1020000.0 | 87 | 1 |
| 14 | BK34 | Sunset Park East | 1004000.0 | 123 | 3 |
| 170 | QN72 | Steinway | 999990.0 | 127 | 4 |
| 15 | BK35 | Stuyvesant Heights | 998000.0 | 215 | 3 |